{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bert import tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Read the parallel dataset and pull out the four sequences used below.\n",
    "# JSON text is UTF-8, so decode it explicitly instead of relying on the locale default.\n",
    "with open('../bahasa/dataset-en-to-ms.json', encoding = 'utf-8') as fopen:\n",
    "    data = json.load(fopen)\n",
    "\n",
    "# X holds the source sentences and Y the aligned targets\n",
    "# (presumably English -> Malay, judging by the file name - confirm).\n",
    "train_X = data['train_X']\n",
    "train_Y = data['train_Y']\n",
    "test_X = data['test_X']\n",
    "test_Y = data['test_Y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Vocabulary file handed to the BERT tokenizer.\n",
    "BERT_VOCAB = '../multi_cased_L-12_H-768_A-12/vocab.txt'\n",
    "\n",
    "# do_lower_case is off; the vocabulary path points at a cased checkpoint.\n",
    "tokenizer = tokenization.FullTokenizer(vocab_file = BERT_VOCAB, do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Id appended to the end of every encoded target sequence in get_inputs;\n",
    "# id 0 is used there as the padding value.\n",
    "# NOTE(review): presumably id 1 is an otherwise unused vocabulary slot - confirm against vocab.txt.\n",
    "EOS = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "from tqdm import tqdm\n",
    "import collections\n",
    "import tensorflow as tf\n",
    "# Fixed length that every encoded source/target sequence is truncated and zero-padded to.\n",
    "maxlen = 256\n",
    "\n",
    "def create_int_feature(values):\n",
    "    \"\"\"Wrap an iterable of ints in a `tf.train.Feature` backed by an int64 list.\"\"\"\n",
    "    int64_list = tf.train.Int64List(value = list(values))\n",
    "    return tf.train.Feature(int64_list = int64_list)\n",
    "\n",
    "def _encode_pair(source, target):\n",
    "    \"\"\"Encode one sentence pair into id lists zero-padded to `maxlen`.\n",
    "\n",
    "    Returns `(input_id, input_mask, segment_id, target_id)`, or None when the\n",
    "    encoded target contains id 0, the value that is also used for padding.\n",
    "    \"\"\"\n",
    "    # Leave room for [CLS]/[SEP] on the source side and for EOS on the target side.\n",
    "    tokens_a = tokenizer.tokenize(unidecode(source))[:maxlen - 2]\n",
    "    tokens_b = tokenizer.tokenize(unidecode(target))[:maxlen - 1]\n",
    "\n",
    "    target_id = tokenizer.convert_tokens_to_ids(tokens_b) + [EOS]\n",
    "    if 0 in target_id:\n",
    "        return None\n",
    "\n",
    "    input_id = tokenizer.convert_tokens_to_ids([\"[CLS]\"] + tokens_a + [\"[SEP]\"])\n",
    "    padding = [0] * (maxlen - len(input_id))\n",
    "    input_mask = [1] * len(input_id) + padding\n",
    "    # Only one segment is fed in, so every segment id is 0.\n",
    "    segment_id = [0] * maxlen\n",
    "    target_id = target_id + [0] * (maxlen - len(target_id))\n",
    "    return input_id + padding, input_mask, segment_id, target_id\n",
    "\n",
    "def get_inputs(x, y, index, prefix = 'train'):\n",
    "    \"\"\"Encode aligned sentence pairs and write them to a single TFRecord file.\n",
    "\n",
    "    Args:\n",
    "        x: indexable collection of source sentences.\n",
    "        y: indexable collection of target sentences, aligned with `x`.\n",
    "        index: value embedded in the output file name to tell chunks apart.\n",
    "        prefix: split name embedded in the output file name.\n",
    "\n",
    "    Writes `multilanguagebert-{prefix}-{index}.tfrecord` in the working\n",
    "    directory. Every record carries the int64 features `input_ids`,\n",
    "    `input_mask`, `segment_ids` and `y`. Returns None.\n",
    "\n",
    "    A pair whose target encodes to id 0 is printed and skipped. Previously the\n",
    "    loop stopped at the first such pair, which silently dropped the rest of\n",
    "    the chunk.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    for i in tqdm(range(len(x))):\n",
    "        row = _encode_pair(x[i], y[i])\n",
    "        if row is None:\n",
    "            print(y[i], i)\n",
    "            continue\n",
    "        rows.append(row)\n",
    "\n",
    "    # The context manager closes the writer even when serialisation fails midway.\n",
    "    with tf.python_io.TFRecordWriter(f'multilanguagebert-{prefix}-{index}.tfrecord') as writer:\n",
    "        for input_id, input_mask, segment_id, target_id in tqdm(rows):\n",
    "            features = collections.OrderedDict()\n",
    "            features['input_ids'] = create_int_feature(input_id)\n",
    "            features['input_mask'] = create_int_feature(input_mask)\n",
    "            features['segment_ids'] = create_int_feature(segment_id)\n",
    "            features['y'] = create_int_feature(target_id)\n",
    "            tf_example = tf.train.Example(\n",
    "                features = tf.train.Features(feature = features)\n",
    "            )\n",
    "            writer.write(tf_example.SerializeToString())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunks_multiple(l, n):\n",
    "    \"\"\"Yield `(sources, targets, offset)` for consecutive windows of `n` pairs.\n",
    "\n",
    "    `l` is a sliceable sequence of two-item pairs. Each window is unzipped into\n",
    "    a tuple of first items and a tuple of second items, and `offset` is the\n",
    "    index in `l` at which the window starts.\n",
    "    \"\"\"\n",
    "    for offset in range(0, len(l), n):\n",
    "        window = l[offset : offset + n]\n",
    "        sources, targets = zip(*window)\n",
    "        yield sources, targets, offset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import multi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 99%|█████████▉| 269276/272587 [04:47<00:15, 210.74it/s] \n",
      "100%|█████████▉| 272068/272587 [04:52<00:02, 241.32it/s]\n",
      " 98%|█████████▊| 268129/272587 [04:51<00:24, 183.84it/s]\n",
      " 98%|█████████▊| 268121/272587 [04:51<00:21, 210.51it/s]\n",
      " 99%|█████████▉| 269679/272587 [04:56<00:11, 254.66it/s]\n",
      "100%|█████████▉| 272038/272587 [04:59<00:01, 297.78it/s]\n",
      "  3%|▎         | 7369/272587 [00:08<04:16, 1032.88it/s]]\n",
      "  1%|          | 1790/272587 [00:01<04:09, 1083.86it/s]]\n",
      "  4%|▍         | 11641/272587 [00:13<04:00, 1084.73it/s]\n",
      "100%|██████████| 272587/272587 [05:04<00:00, 895.30it/s]\n",
      "  5%|▍         | 12298/272587 [00:14<07:36, 570.16it/s]]\n",
      "  5%|▌         | 13664/272587 [00:15<05:13, 825.19it/s]]\n",
      "100%|██████████| 272587/272587 [03:06<00:00, 1458.63it/s]\n",
      "100%|██████████| 272587/272587 [03:11<00:00, 1419.92it/s]\n",
      "100%|██████████| 272587/272587 [03:06<00:00, 1463.97it/s]\n",
      " 96%|█████████▌| 261471/272587 [02:54<00:04, 2664.67it/s]\n",
      "100%|██████████| 6/6 [00:00<00:00, 697.85it/s]128.19it/s]\n",
      "100%|██████████| 6/6 [00:00<00:00, 1958.58it/s]13.99it/s]\n",
      "100%|██████████| 272587/272587 [03:04<00:00, 1480.96it/s]\n",
      "100%|██████████| 272587/272587 [03:00<00:00, 1508.79it/s]\n",
      "100%|██████████| 272587/272587 [02:57<00:00, 1532.99it/s]\n",
      "100%|██████████| 272587/272587 [03:02<00:00, 1497.39it/s]\n",
      "100%|██████████| 272587/272587 [03:04<00:00, 1480.57it/s]\n",
      "100%|██████████| 272587/272587 [02:59<00:00, 1519.42it/s]\n",
      "100%|██████████| 272587/272587 [03:00<00:00, 1514.14it/s]\n",
      "100%|██████████| 272587/272587 [02:57<00:00, 1539.36it/s]\n"
     ]
    }
   ],
   "source": [
    "# Cut the training pairs into windows of len(train_X) // 12 pairs and hand the\n",
    "# resulting chunk generator, together with get_inputs, to multi.multiprocessing.\n",
    "# NOTE(review): presumably each chunk is unpacked into a get_inputs call in a worker - confirm in multi.py.\n",
    "multi.multiprocessing(\n",
    "    chunks_multiple(list(zip(train_X, train_Y)), len(train_X) // 12),\n",
    "    get_inputs,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunks_multiple(l, n, prefix = 'test'):\n",
    "    \"\"\"Yield `(sources, targets, offset, prefix)` for consecutive windows of `n` pairs.\n",
    "\n",
    "    `l` is a sliceable sequence of two-item pairs. Each window is unzipped into\n",
    "    a tuple of first items and a tuple of second items, `offset` is the index\n",
    "    in `l` at which the window starts, and `prefix` is passed through unchanged\n",
    "    as the fourth field (default `'test'`).\n",
    "\n",
    "    This definition shadows the earlier three-field `chunks_multiple`. Running\n",
    "    the training cell again after this one would tag training chunks with\n",
    "    `prefix`, so pass the split name explicitly in that case.\n",
    "    \"\"\"\n",
    "    for i in range(0, len(l), n):\n",
    "        x, y = zip(*l[i : i + n])\n",
    "        yield (x, y, i, prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5563/5563 [00:04<00:00, 1229.64it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1203.67it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1201.45it/s]\n",
      "  4%|▍         | 227/5563 [00:00<00:02, 2263.69it/s]]\n",
      " 12%|█▏        | 640/5563 [00:00<00:02, 2092.59it/s]]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1147.49it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1145.38it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1151.62it/s]\n",
      "  0%|          | 0/5563 [00:00<?, ?it/s]2140.74it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1139.78it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1130.13it/s]\n",
      "100%|██████████| 5563/5563 [00:04<00:00, 1115.46it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2139.11it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 1281.88it/s]7it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 1252.78it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2010.85it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 1898.03it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2089.37it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2010.86it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2090.55it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 2118.43it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 1992.12it/s]\n",
      " 97%|█████████▋| 5388/5563 [00:02<00:00, 2018.99it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 1997.66it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 1938.20it/s]\n",
      "100%|██████████| 5563/5563 [00:02<00:00, 1948.85it/s]\n"
     ]
    }
   ],
   "source": [
    "# Cut the test pairs into windows of len(test_X) // 12 pairs and hand the\n",
    "# resulting chunk generator, together with get_inputs, to multi.multiprocessing.\n",
    "# NOTE(review): presumably each chunk is unpacked into a get_inputs call in a worker - confirm in multi.py.\n",
    "multi.multiprocessing(\n",
    "    chunks_multiple(list(zip(test_X, test_Y)), len(test_X) // 12),\n",
    "    get_inputs,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
